{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "from bs4 import BeautifulSoup\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer as TF\n",
    "from sklearn.naive_bayes import MultinomialNB as MNB\n",
    "from sklearn.linear_model import LogisticRegression as LR\n",
    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
    "from sklearn.model_selection import cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(origin_text):\n",
    "    # 去掉html标签\n",
    "    text = BeautifulSoup(origin_text).get_text()\n",
    "    # 去掉标点符号和非法字符\n",
    "    text = re.sub(\"[^a-zA-Z]\", \" \", text)\n",
    "    # 将字符全部转化为小写，并通过空格符进行分词处理\n",
    "    words = text.lower().split()\n",
    "    # 去停用词\n",
    "    stop_words = set(stopwords.words(\"english\"))\n",
    "    meaningful_words = [w for w in words if w not in stop_words]\n",
    "    # 将剩下的词还原成str类型\n",
    "    cleaned_text = \" \".join(meaningful_words)\n",
    "    return cleaned_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>review_date</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>is_spoiler</th>\n",
       "      <th>review_text</th>\n",
       "      <th>rating</th>\n",
       "      <th>review_summary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>23 September 2013</td>\n",
       "      <td>tt1392214</td>\n",
       "      <td>ur38303522</td>\n",
       "      <td>True</td>\n",
       "      <td>The film Prisoners from the outside looked lik...</td>\n",
       "      <td>10</td>\n",
       "      <td>\"Prisoners\" Better than expected</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2 October 2006</td>\n",
       "      <td>tt0095327</td>\n",
       "      <td>ur12460427</td>\n",
       "      <td>False</td>\n",
       "      <td>This is an animated story about a victim and h...</td>\n",
       "      <td>10</td>\n",
       "      <td>Great tearjerker</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2 May 2017</td>\n",
       "      <td>tt5311514</td>\n",
       "      <td>ur75844575</td>\n",
       "      <td>False</td>\n",
       "      <td>I am going to keep this short but your name is...</td>\n",
       "      <td>10</td>\n",
       "      <td>it is amazing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22 November 2012</td>\n",
       "      <td>tt0017136</td>\n",
       "      <td>ur29659325</td>\n",
       "      <td>False</td>\n",
       "      <td>I would have given this movie more stars if it...</td>\n",
       "      <td>1</td>\n",
       "      <td>visuals riveting</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>21 December 2004</td>\n",
       "      <td>tt0119094</td>\n",
       "      <td>ur0263096</td>\n",
       "      <td>False</td>\n",
       "      <td>I mean come on. I thought this story was great...</td>\n",
       "      <td>8</td>\n",
       "      <td>Calm Down People - Give This Picture A Break</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         review_date   movie_id     user_id  is_spoiler  \\\n",
       "0  23 September 2013  tt1392214  ur38303522        True   \n",
       "1     2 October 2006  tt0095327  ur12460427       False   \n",
       "2         2 May 2017  tt5311514  ur75844575       False   \n",
       "3   22 November 2012  tt0017136  ur29659325       False   \n",
       "4   21 December 2004  tt0119094   ur0263096       False   \n",
       "\n",
       "                                         review_text  rating  \\\n",
       "0  The film Prisoners from the outside looked lik...      10   \n",
       "1  This is an animated story about a victim and h...      10   \n",
       "2  I am going to keep this short but your name is...      10   \n",
       "3  I would have given this movie more stars if it...       1   \n",
       "4  I mean come on. I thought this story was great...       8   \n",
       "\n",
       "                                 review_summary  \n",
       "0              \"Prisoners\" Better than expected  \n",
       "1                              Great tearjerker  \n",
       "2                                 it is amazing  \n",
       "3                              visuals riveting  \n",
       "4  Calm Down People - Give This Picture A Break  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = pd.read_json('../data/train.json')\n",
    "test_df = pd.read_json('../data/test.json')\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    film prisoners outside looked like kidnapped s...\n",
       "1    animated story victim family ww era keep short...\n",
       "2    going keep short name amazing art style camera...\n",
       "3    would given movie stars appropriately cut musi...\n",
       "4    mean come thought story great cop obviously wo...\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df['text'] = train_df['review_text'].apply(lambda x: clean_text(x))\n",
    "test_df['text'] = test_df['review_text'].apply(lambda x: clean_text(x))\n",
    "train_df['text'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(type(train_df['is_spoiler'][0]))\n",
    "train_df['is_spoiler'] = train_df['is_spoiler'].apply(lambda x: 1 if x else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating the tfidf vector...\n",
      "\n",
      "(473913, 200)\n",
      "(100000, 200)\n"
     ]
    }
   ],
   "source": [
    "train_df = train_df.sample(frac=1).reset_index(drop=True)\n",
    "tfidf = TF(\n",
    "    analyzer=\"word\",\n",
    "    tokenizer=None,\n",
    "    preprocessor=None,\n",
    "    stop_words=None,\n",
    "    max_features=200)\n",
    "\n",
    "# 数据向量化\n",
    "print(\"Creating the tfidf vector...\\n\")\n",
    "tfidf.fit(train_df['text'])\n",
    "x_train = tfidf.transform(train_df['text'])\n",
    "x_train = x_train.toarray()\n",
    "\n",
    "x_test = tfidf.transform(test_df['text'])\n",
    "x_test = x_test.toarray()\n",
    "\n",
    "print(x_train.shape)\n",
    "print(x_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    349286\n",
       "1    124627\n",
       "Name: is_spoiler, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train = train_df['is_spoiler']\n",
    "y_train.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = LR(solver='liblinear')\n",
    "model.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10折交叉验证：\n",
      "0.7478672275924236\n"
     ]
    }
   ],
   "source": [
    "print(\"10折交叉验证：\")\n",
    "print(np.mean(cross_val_score(model, x_train, y_train, cv=10, scoring=\"accuracy\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  pred\n",
       "0   0     0\n",
       "1   1     0\n",
       "2   2     0\n",
       "3   3     0\n",
       "4   4     0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds = model.predict(x_test)\n",
    "submission = pd.DataFrame({'id': range(len(preds)), 'pred': preds})\n",
    "submission['id'] = submission['id']\n",
    "submission.to_csv(\"../data/ml_submission.csv\", index=False, header=False)\n",
    "submission.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
